https://www.datacamp.com/courses/machine-learning-in-the-tidyverse

nest()

# install.packages('dslabs')
library(dslabs)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Эта функция создает новый df, где колонка data содержит все данные для переменной по которой группируем.

# Explore gapminder
head(gapminder)
##               country year infant_mortality life_expectancy fertility
## 1             Albania 1960           115.40           62.87      6.19
## 2             Algeria 1960           148.20           47.50      7.65
## 3              Angola 1960           208.00           35.98      7.32
## 4 Antigua and Barbuda 1960               NA           62.97      4.43
## 5           Argentina 1960            59.87           65.39      3.11
## 6             Armenia 1960               NA           66.86      4.55
##   population          gdp continent          region
## 1    1636054           NA    Europe Southern Europe
## 2   11124892  13828152297    Africa Northern Africa
## 3    5270844           NA    Africa   Middle Africa
## 4      54681           NA  Americas       Caribbean
## 5   20619075 108322326649  Americas   South America
## 6    1867396           NA      Asia    Western Asia
# Prepare the nested dataframe gap_nested
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ readr   1.3.1
## ✔ tibble  2.1.3     ✔ purrr   0.3.2
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ ggplot2 3.2.1     ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
gap_nested <- gapminder %>% 
  group_by(country) %>% 
  nest()

# Explore gap_nested
head(gap_nested)
## # A tibble: 6 x 2
## # Groups:   country [185]
##   country                       data
##   <fct>               <list<df[,8]>>
## 1 Albania                   [57 × 8]
## 2 Algeria                   [57 × 8]
## 3 Angola                    [57 × 8]
## 4 Antigua and Barbuda       [57 × 8]
## 5 Argentina                 [57 × 8]
## 6 Armenia                   [57 × 8]

C помощью функции unnest() можно вернуться к изначальному df Фнукция identical() сравнивает 2 объекта

# Create the unnested dataframe called gap_unnnested
gap_unnested <- gap_nested %>% 
                unnest(data)
  
# Confirm that your data was not modified  
identical(gap_unnested, gapminder)
## [1] FALSE

К колонке data можно обращаться следующим образом То есть вытаскивать df и работать с ним как с обычным

# Extract the data of Algeria
algeria_df <- gap_nested$data[[1]]
# Calculate the minimum of the population vector
min(algeria_df$population)
## [1] NA
# Calculate the maximum of the population vector
max(algeria_df$population)
## [1] NA
# Calculate the mean of the population vector
mean(algeria_df$population)
## [1] NA

map()

Можно высчитывать не только среднее для одного df, а сразу для всех с помощью map

# Calculate the mean population for each country
pop_nested <- gap_nested %>%
  mutate(mean_pop = map(data, ~mean(.x$population)))

# Take a look at pop_nested
head(pop_nested)
## # A tibble: 6 x 3
## # Groups:   country [185]
##   country                       data mean_pop 
##   <fct>               <list<df[,8]>> <list>   
## 1 Albania                   [57 × 8] <dbl [1]>
## 2 Algeria                   [57 × 8] <dbl [1]>
## 3 Angola                    [57 × 8] <dbl [1]>
## 4 Antigua and Barbuda       [57 × 8] <dbl [1]>
## 5 Argentina                 [57 × 8] <dbl [1]>
## 6 Armenia                   [57 × 8] <dbl [1]>
# Extract the mean_pop value by using unnest
pop_mean <- pop_nested %>% 
  unnest(mean_pop)

# Take a look at pop_mean
head(pop_mean)
## # A tibble: 6 x 3
## # Groups:   country [185]
##   country                       data mean_pop
##   <fct>               <list<df[,8]>>    <dbl>
## 1 Albania                   [57 × 8]       NA
## 2 Algeria                   [57 × 8]       NA
## 3 Angola                    [57 × 8]       NA
## 4 Antigua and Barbuda       [57 × 8]       NA
## 5 Argentina                 [57 × 8]       NA
## 6 Armenia                   [57 × 8]       NA

map возращает лись, поэтому приходится применять unnest() Чтобы избежать этого, можно сразу использовать семейство функций map_*(map_dbl)

# Calculate mean population and store result as a double
pop_mean <- gap_nested %>%
  mutate(mean_pop = map_dbl(data, ~mean(.x$population)))

# Take a look at pop_mean
head(pop_mean)
## # A tibble: 6 x 3
## # Groups:   country [185]
##   country                       data mean_pop
##   <fct>               <list<df[,8]>>    <dbl>
## 1 Albania                   [57 × 8]       NA
## 2 Algeria                   [57 × 8]       NA
## 3 Angola                    [57 × 8]       NA
## 4 Antigua and Barbuda       [57 × 8]       NA
## 5 Argentina                 [57 × 8]       NA
## 6 Armenia                   [57 × 8]       NA

С помощью map можно засовывать результаты линейной регрессии

# Build a linear model for each country
gap_models <- gap_nested %>%
    mutate(model = map(data, ~lm(formula = life_expectancy~year, data = .x)))
    
# Extract the model for Algeria    
algeria_model <- gap_models$model[[1]]

# View the summary for the Algeria model
summary(algeria_model)
## 
## Call:
## lm(formula = life_expectancy ~ year, data = .x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7990 -0.3172 -0.1635  0.5574  1.1488 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.974e+02  1.236e+01  -32.14   <2e-16 ***
## year         2.362e-01  6.219e-03   37.99   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7724 on 55 degrees of freedom
## Multiple R-squared:  0.9633, Adjusted R-squared:  0.9626 
## F-statistic:  1443 on 1 and 55 DF,  p-value: < 2.2e-16

пакет broom

есть три основные функции

tidy()

Информация о коэффициентах

library(broom)

# Extract the coefficients of the algeria_model as a dataframe
tidy(algeria_model)
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept) -397.     12.4         -32.1 2.48e-37
## 2 year           0.236   0.00622      38.0 3.72e-41

glance()

Информация о статистиках(R^2)

# Extract the statistics of the algeria_model as a dataframe
glance(algeria_model)
## # A tibble: 1 x 11
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <int>  <dbl> <dbl> <dbl>
## 1     0.963         0.963 0.772     1443. 3.72e-41     2  -65.1  136.  142.
## # … with 2 more variables: deviance <dbl>, df.residual <int>

augment()

Информация о реальных значениях, о предсказанных об ошибках

algeria_fitted <- augment(algeria_model)

График линии модели и данных

library(ggplot2)
algeria_fitted %>% 
  ggplot(mapping = aes(x = year)) +
  geom_point(mapping = aes(y = life_expectancy)) +
  geom_line(mapping = aes(y = .fitted), color = "red")

tidy + map

Можно применить функцию tidy для всех стран и записать их в табличку

# Extract the coefficient statistics of each model into nested dataframes
model_coef_nested <- gap_models %>% 
    mutate(coef = map(model, ~tidy(.x)))
    
# Simplify the coef dataframes for each model    
model_coef <- model_coef_nested %>%
    unnest(coef)

После чего можно работать с этой табличкой Например нарисовать гистограмму распределения коэффициента при переменной year

model_coef %>% 
  filter(term == "year") %>% 
  ggplot(aes(x = estimate)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

glance + map

Получим статистики по всем странам

# Extract the fit statistics of each model into dataframes
model_perf_nested <- gap_models %>% 
    mutate(fit = map(model, ~glance(.x)))

# Simplify the fit dataframes for each model    
model_perf <- model_perf_nested %>% 
    unnest(fit)
    
# Look at the first six rows of model_perf
head(model_perf)
## # A tibble: 6 x 14
## # Groups:   country [185]
##   country     data model r.squared adj.r.squared sigma statistic  p.value
##   <fct>   <list<d> <lis>     <dbl>         <dbl> <dbl>     <dbl>    <dbl>
## 1 Albania [57 × 8] <lm>      0.963         0.963 0.772    1443.  3.72e-41
## 2 Algeria [57 × 8] <lm>      0.938         0.937 2.53      828.  7.84e-35
## 3 Angola  [57 × 8] <lm>      0.989         0.989 0.698    5091.  6.69e-56
## 4 Antigu… [57 × 8] <lm>      0.938         0.937 0.977     828.  7.64e-35
## 5 Argent… [57 × 8] <lm>      0.983         0.982 0.479    3103.  4.58e-50
## 6 Armenia [57 × 8] <lm>      0.288         0.275 1.57       22.2 1.70e- 5
## # … with 6 more variables: df <int>, logLik <dbl>, AIC <dbl>, BIC <dbl>,
## #   deviance <dbl>, df.residual <int>

Потом можно уже работать с этой табличкой

Построить гистограмму R2

model_perf %>% 
  ggplot(aes(x = r.squared)) + 
  geom_histogram()  
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Выбрать топ-4 лудших и худших моделей

# Extract the 4 best fitting models
best_fit <- model_perf %>% 
  top_n(n = 4, wt = r.squared)

# Extract the 4 models with the worst fit
worst_fit <- model_perf %>% 
  top_n(n = 4, wt = -r.squared)

augment + map

Это дает возможность легко рисовать графики для всех стран

сделаем таблички для лучших и худших стран

best_augmented <- best_fit %>% 
  # Build the augmented dataframe for each country model
  mutate(augmented = map(.x = model, ~augment(.x))) %>% 
  # Expand the augmented dataframes
  unnest(augmented)

worst_augmented <- worst_fit %>% 
  # Build the augmented dataframe for each country model
  mutate(augmented = map(.x = model, ~augment(.x))) %>% 
  # Expand the augmented dataframes
  unnest(augmented)

Графики для лучших

# Compare the predicted values with the actual values of life expectancy 
# for the top 4 best fitting models
best_augmented %>% 
  ggplot(aes(x = year)) +
  geom_point(aes(y = life_expectancy)) + 
  geom_line(aes(y = .fitted), color = "red") +
  facet_wrap(~country, scales = "free_y")

Графики для худших

# Compare the predicted values with the actual values of life expectancy 
# for the top 4 worst fitting models
worst_augmented %>% 
  ggplot(aes(x = year)) +
  geom_point(aes(y = life_expectancy)) + 
  geom_line(aes(y = .fitted), color = "red") +
  facet_wrap(~country, scales = "free_y")

Улучшение качества моделей

Сделаем не парную а мультирегерссию

# Build a linear model for each country using all features
gap_fullmodel <- gap_nested %>% 
  mutate(model = map(data, ~lm(formula = life_expectancy ~ ., data = .x)))
## Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]): contrasts can be applied only to factors with 2 or more levels
fullmodel_perf <- gap_fullmodel %>% 
  # Extract the fit statistics of each model into dataframes
  mutate(fit = map(model, ~glance(.x))) %>% 
  # Simplify the fit dataframes for each model
  unnest(fit)
## Error in eval(lhs, parent, parent): объект 'gap_fullmodel' не найден
# View the performance for the four countries with the worst fitting 
# four simple models you looked at before
fullmodel_perf %>% 
  filter(country %in% worst_fit$country) %>% 
  select(country, adj.r.squared)
## Error in eval(lhs, parent, parent): объект 'fullmodel_perf' не найден

train-test split + cross validation

Все функции для деления в пакете rsample

# install.packages('rsample')
library(rsample)

Деление на train и test в пропорции 3 к 4

set.seed(42)
# Prepare the initial split object
gap_split <- initial_split(gapminder, prop = 0.75)
# Extract the training dataframe
training_data <- training(gap_split)
# Extract the testing dataframe
testing_data <- testing(gap_split)
# Calculate the dimensions of both training_data and testing_data
dim(training_data)
## [1] 7909    9
dim(testing_data)
## [1] 2636    9

Теперь используем кросс валидацию

set.seed(42)

# Prepare the dataframe containing the cross validation partitions
cv_split <- vfold_cv(training_data, v = 5)

cv_data <- cv_split %>% 
  mutate(
    # Extract the train dataframe for each split
    train = map(splits, ~training(.x)), 
    # Extract the validate dataframe for each split
    validate = map(splits, ~testing(.x))
  )

# Use head() to preview cv_data
head(cv_data)
## # A tibble: 5 x 4
##   splits              id    train                validate            
## * <named list>        <chr> <named list>         <named list>        
## 1 <split [6.3K/1.6K]> Fold1 <df[,9] [6,327 × 9]> <df[,9] [1,582 × 9]>
## 2 <split [6.3K/1.6K]> Fold2 <df[,9] [6,327 × 9]> <df[,9] [1,582 × 9]>
## 3 <split [6.3K/1.6K]> Fold3 <df[,9] [6,327 × 9]> <df[,9] [1,582 × 9]>
## 4 <split [6.3K/1.6K]> Fold4 <df[,9] [6,327 × 9]> <df[,9] [1,582 × 9]>
## 5 <split [6.3K/1.6K]> Fold5 <df[,9] [6,328 × 9]> <df[,9] [1,581 × 9]>

Построение модели и расчет метрики

Обучаем модели на каждом фолде

# Build a model using the train data for each fold of the cross validation
cv_models_lm <- cv_data %>% 
  mutate(model = map(.x = train, ~lm(formula = life_expectancy ~ ., data = .x)))

Вытаскиваем настоящие значения и делаем прогноз

cv_prep_lm <- cv_models_lm %>% 
  mutate(
    # Extract the recorded life expectancy for the records in the validate dataframes
    validate_actual = map(validate, ~.x$life_expectancy),
    # Predict life expectancy for each validate set using its corresponding model
    validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y))
  )
## Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels): factor country has new levels Aruba, French Polynesia, Greenland, Hong Kong, China, Macao, China, New Caledonia, Puerto Rico

разные метрики есть в пакете Metrics

# install.packages("Metrics")
library(Metrics)

считаем метрику для каждого фолда, а потом среднее

library(Metrics)
# Calculate the mean absolute error for each validate fold       
cv_eval_lm <- cv_prep_lm %>% 
  mutate(validate_mae = map2_dbl(validate_actual, validate_predicted, ~mae(actual = .x, predicted = .y)))
## Error in eval(lhs, parent, parent): объект 'cv_prep_lm' не найден
# Print the validate_mae column
cv_eval_lm$validate_mae
## Error in eval(expr, envir, enclos): объект 'cv_eval_lm' не найден
# Calculate the mean of validate_mae column
mean(cv_eval_lm$validate_mae)
## Error in mean(cv_eval_lm$validate_mae): объект 'cv_eval_lm' не найден

Random Forest

находится в пакете ranger

# install.packages('ranger')
library(ranger)

Обучим и посмотрим

# Build a random forest model for each fold
cv_models_rf <- cv_data %>% 
  mutate(model = map(train, ~ranger(formula = life_expectancy~., data = .x,
                                    num.trees = 100, seed = 42)))
## Error: Missing data in columns: infant_mortality, fertility, population, gdp.
# Generate predictions using the random forest model
cv_prep_rf <- cv_models_rf %>% 
  mutate(validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y)$predictions))
## Error in eval(lhs, parent, parent): объект 'cv_models_rf' не найден

теперь померием метрику

# Calculate validate MAE for each fold
cv_eval_rf <- cv_prep_rf %>% 
  mutate(validate_mae = map2_dbl(validate_actual, validate_predicted, ~mae(actual = .x, predicted = .y)))
## Error in eval(lhs, parent, parent): объект 'cv_prep_rf' не найден
# Print the validate_mae column
print(cv_eval_rf$validate_mae)
## Error in print(cv_eval_rf$validate_mae): объект 'cv_eval_rf' не найден
# Calculate the mean of validate_mae column
mean(cv_eval_rf$validate_mae)
## Error in mean(cv_eval_rf$validate_mae): объект 'cv_eval_rf' не найден

она меньше чем у LR

Тюним модель по параметру mtry

# Prepare for tuning your cross validation folds by varying mtry
cv_tune <- cv_data %>% 
  crossing(mtry = 2:5) 

# Build a model for each fold & mtry combination
cv_model_tunerf <- cv_tune %>% 
  mutate(model = map2(.x = train, .y = mtry, ~ranger(formula = life_expectancy~., 
                                           data = .x, mtry = .y, 
                                           num.trees = 100, seed = 42)))
## Error: Missing data in columns: infant_mortality, fertility, population, gdp.

Делаем предикт и Смотрим для какого параметра модель лучше

# Generate validate predictions for each model
cv_prep_tunerf <- cv_model_tunerf %>% 
  mutate(validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y)$predictions))
## Error in eval(lhs, parent, parent): объект 'cv_model_tunerf' не найден
# Calculate validate MAE for each fold and mtry combination
cv_eval_tunerf <- cv_prep_tunerf %>% 
  mutate(validate_mae = map2_dbl(.x = validate_actual, .y = validate_predicted, ~mae(actual = .x, predicted = .y)))
## Error in eval(lhs, parent, parent): объект 'cv_prep_tunerf' не найден
# Calculate the mean validate_mae for each mtry used  
cv_eval_tunerf %>% 
  group_by(mtry) %>% 
  summarise(mean_mae = mean(validate_mae))
## Error in eval(lhs, parent, parent): объект 'cv_eval_tunerf' не найден

Для параметра 4!

Поняли, какая модель лучшая, поэтому обучаем ее на все train и делаем predict на test

best_model <- ranger(formula = life_expectancy~., data = training_data,
                     mtry = 4, num.trees = 100, seed = 42)
## Error: Missing data in columns: infant_mortality, fertility, population, gdp.
test_actual <- testing_data$life_expectancy
test_predict <- predict(best_model, testing_data)$predictions
## Error in predict(best_model, testing_data): объект 'best_model' не найден
mae(test_actual, test_predict)
## Error in ae(actual, predicted): объект 'test_predict' не найден

Задача классификации